In [1]:
from sentence_transformers import SentenceTransformer
# import pyreadr
import os
import pandas as pd
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
import textwrap
import itables
from sklearn.cluster import AgglomerativeClustering

itables.init_notebook_mode(all_interactive=True, connected=False)

Semantic Similarity Across Theoretical Constructs¶

Data Transformation¶

In [2]:
# Activate the automatic conversion of R objects to pandas objects
pandas2ri.activate()

# Expand the user path if needed
rdata_path = os.path.expanduser("~/Documents/Coding/SAA_OA/python_interfacing_ver.RData")

# Load the file using the R function load()
robjects.r['load'](rdata_path)

# List all objects currently in the R global environment
loaded_objects = robjects.r.ls()
# print("Loaded R objects:", list(loaded_objects))


# Replace 'my_data' with the actual name of the object loaded from the RData file
r_constructs = robjects.r['constructs']

# Convert to a pandas DataFrame
constructs = pandas2ri.rpy2py(r_constructs)

# Now you can work with the DataFrame in Python
print(constructs.head())

print(constructs['description'])
  project                                        description  narrative_count  \
1      OA  HP originates in parental representations and ...             30.0   
2      OA                         HP is an attachment figure             32.0   
3      OA  The quality of one’s HP relationship influence...             24.0   
4      OA  Construction of a reliable HP is a pathway for...             35.0   
5     SAA            Spirituality reshapes behavioral schema             29.0   

   RT_count hierarchy  TC  
1      56.0       TC1   1  
2      83.0       TC2   2  
3      47.0       TC3   3  
4      86.0       TC4   4  
5      73.0       TC1   1  
1    HP originates in parental representations and ...
2                           HP is an attachment figure
3    The quality of one’s HP relationship influence...
4    Construction of a reliable HP is a pathway for...
5              Spirituality reshapes behavioral schema
6    Identity integrated with spirituality influenc...
7                           HP is an attachment figure
8                  Spirituality shapes social identity
Name: description, dtype: object

Load a pretrained Model

In [3]:
# 1. Load a pretrained Sentence Transformer model
# model = SentenceTransformer("all-MiniLM-L6-v2") 5 times fasters
model = SentenceTransformer("all-mpnet-base-v2")
In [4]:
constructs = constructs.rename(columns={"project": "program"})
# Add a new column 'graph_label' combining 'description' and 'project' in the format of (<program>)
constructs['graph_label'] = constructs['description'] + " (" + constructs['program'] + ")"
print(constructs[['description', 'graph_label']])
                                         description  \
1  HP originates in parental representations and ...   
2                         HP is an attachment figure   
3  The quality of one’s HP relationship influence...   
4  Construction of a reliable HP is a pathway for...   
5            Spirituality reshapes behavioral schema   
6  Identity integrated with spirituality influenc...   
7                         HP is an attachment figure   
8                Spirituality shapes social identity   

                                         graph_label  
1  HP originates in parental representations and ...  
2                    HP is an attachment figure (OA)  
3  The quality of one’s HP relationship influence...  
4  Construction of a reliable HP is a pathway for...  
5      Spirituality reshapes behavioral schema (SAA)  
6  Identity integrated with spirituality influenc...  
7                   HP is an attachment figure (SAA)  
8          Spirituality shapes social identity (SAA)  

Performing Semantic Similarity Analysis¶

In [5]:
# Assuming 'constructs["description"]' is your list of sentences
sentences = constructs['description']

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
(8, 768)
tensor([[1.0000, 0.4711, 0.2740, 0.2876, 0.2060, 0.1137, 0.4711, 0.0822],
        [0.4711, 1.0000, 0.3047, 0.2424, 0.0912, 0.1205, 1.0000, 0.0803],
        [0.2740, 0.3047, 1.0000, 0.7878, 0.3226, 0.4944, 0.3047, 0.2786],
        [0.2876, 0.2424, 0.7878, 1.0000, 0.2760, 0.3242, 0.2424, 0.1612],
        [0.2060, 0.0912, 0.3226, 0.2760, 1.0000, 0.5964, 0.0912, 0.6619],
        [0.1137, 0.1205, 0.4944, 0.3242, 0.5964, 1.0000, 0.1205, 0.6947],
        [0.4711, 1.0000, 0.3047, 0.2424, 0.0912, 0.1205, 1.0000, 0.0803],
        [0.0822, 0.0803, 0.2786, 0.1612, 0.6619, 0.6947, 0.0803, 1.0000]])
In [6]:
# If 'similarities' is a PyTorch tensor, convert it to a NumPy array first:
similarity_matrix = similarities.detach().cpu().numpy()

# Create a DataFrame with sentences as both index and columns:
labels = constructs['graph_label']
similarity_df = pd.DataFrame(similarity_matrix, index=labels, columns=labels)



itables.show(similarity_df, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
graph_label HP originates in parental representations and can be modified (OA) HP is an attachment figure (OA) The quality of one’s HP relationship influences food addiction recovery (OA) Construction of a reliable HP is a pathway for food addiction recovery (OA) Spirituality reshapes behavioral schema (SAA) Identity integrated with spirituality influences sex addiction (SAA) HP is an attachment figure (SAA) Spirituality shapes social identity (SAA)
graph_label
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Visualization¶

In [7]:
%config InlineBackend.figure_format = 'svg'

import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
import numpy as np

# Mask the upper triangle and diagonal of the heatmap
mask = np.triu(np.ones_like(similarity_df, dtype=bool))

# Create the figure and heatmap, and get the current Axes object
plt.figure(figsize=(10, 10))
ax = sns.heatmap(similarity_df,mask=mask, annot=True, fmt=".2f", cmap="Blues", 
                 cbar=True, linewidths=0.5)

plt.title("Semantic Similarity of Theoretical Constructs")
# Remove the axis titles
ax.set_xlabel('')
ax.set_ylabel('')
# Wrap and rotate x-axis labels:
xlabels = ax.get_xticklabels()
wrapped_xlabels = [textwrap.fill(label.get_text(), width=40) for label in xlabels]
ax.set_xticklabels(wrapped_xlabels, rotation=45, ha='right') # Rotate x labels

# For y-axis: Wrap the text with a width of 25.
ylabels = ax.get_yticklabels()
wrapped_ylabels = [textwrap.fill(label.get_text(), width=25) for label in ylabels]
ax.set_yticklabels(wrapped_ylabels, rotation=0)  # Keep y labels horizontal

plt.tight_layout()  # Adjust layout to fit labels properly
plt.show()
No description has been provided for this image

Key Similarities between Theoretical Constructs¶

  • (2,7) = 1.00
    • "HP is an attachment figure (OA)" vs. "HP is an attachment figure (SAA)"
    • Essentially identical, indicating both programs use the same language and meaning around HP-as-attachment.
  • (3,4) = 0.82
    • "The quality of one’s HP relationship influences food addiction recovery (OA)" vs. "Construction of a reliable HP is a pathway for food addiction recovery (OA)"
    • Indicates strong conceptual overlap; both constructs center on HP’s trustworthiness and its role in supporting recovery.
  • (5,8) = 0.71
    • "Spirituality reshapes behavioral schema (SAA)" vs. "Spirituality shapes social identity (SAA)"
    • Shows a close connection between reconfiguring behaviors and reshaping social identity through spirituality in SAA.
  • (5,6) = 0.56
    • "Spirituality reshapes behavioral schema (SAA)" vs. "Identity integrated with spirituality influences sex addiction (SAA)"
    • Reflects moderate alignment, indicating that spiritual transformation of behavior and its integration into identity are closely related in SAA.
  • (1,2) = 0.54 and (1,7) = 0.54
    • (1,2): "HP originates in parental representations and can be modified (OA)" vs. "HP is an attachment figure (OA)"
    • (1,7): "HP originates in parental representations and can be modified (OA)" vs. "HP is an attachment figure (SAA)"
    • These values indicate a moderate similarity, suggesting that understanding HP as influenced by parental models moderately aligns with viewing HP as an attachment figure.
  • (3,6) = 0.35
    • "The quality of one’s HP relationship influences food addiction recovery (OA)" vs. "Identity integrated with spirituality influences sex addiction (SAA)"
    • Shows some conceptual resonance between HP relational quality and spiritual identity integration, though less pronounced than within-program comparisons.

Lower Similarities¶

  • Many cross-program comparisons fall in the 0.05–0.30 range, reflecting differing emphases between OA (focused on developmental/attachment aspects) and SAA (focused on identity and behavior change).
  • Some pairs show near-zero similarity (e.g., around 0.02–0.03), highlighting fundamental differences where the concepts diverge strongly.

Overall Observations¶

  • Within-program constructs share the highest similarities.
    • In OA, the relational quality and reliable construction of HP (0.82) are closely linked.
    • In SAA, constructs related to behavioral schema and social identity (0.71 and 0.56) show a strong connection.
  • The exact phrasing of "HP is an attachment figure" in OA and SAA (2,7) aligns perfectly (1.00), indicating shared conceptualization despite different program contexts.
  • Cross-program comparisons generally show lower similarity scores, reflecting that OA’s focus on developmental and attachment models differs from SAA’s emphasis on identity transformation through spirituality.

Semantic Similarity Across Relevant Texts under Theoretical Constructs¶

Rationale is to check the similarity of the relevant texts of 2 similar or related theoretical constructs and themes (esp. across programs)

Data Cleaning¶

In [8]:
# Load Relevant Text Data

# Replace 'my_data' with the actual name of the object loaded from the RData file
r_RT_data = robjects.r['RT_data']

# Convert to a pandas DataFrame
RT_data = pandas2ri.rpy2py(r_RT_data)

# Extract TC and Theme with Orphan handling
# Extract the numeric part following 'TC' for the TC column
RT_data['TC'] = RT_data['code'].apply(
    lambda x: int(x.split('>')[1].strip()[2:]) if 'Orphan' not in x else np.nan
)

# Extract the numeric part following 'T' for the theme column
RT_data['theme'] = RT_data['code'].apply(
    lambda x: int(x.split('>')[2].strip()[1:]) if 'Orphan' not in x else np.nan
)

RT_data['ID'] = range(1, len(RT_data)+1)

RT_SAA = RT_data[RT_data['program'] == 'SAA'].reset_index(drop=True)
RT_OA = RT_data[RT_data['program'] == 'OA'].reset_index(drop=True)



# Now you can work with the DataFrame in Python
itables.show(RT_data.head(), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
code RT characters Beginning End narrative_num RI program coverage TC theme ID
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)
In [9]:
# Load Hierarchy Data

# Replace 'my_data' with the actual name of the object loaded from the RData file
r_hierarchy = robjects.r['hierarchy']

# Convert to a pandas DataFrame
hierarchy = pandas2ri.rpy2py(r_hierarchy)

# Now you can work with the DataFrame in Python
hierarchy.head()

hierarchy = hierarchy.applymap(lambda x: None if isinstance(x, (int, float)) and x < 0 else x)

# Replace NaN values with 0 before converting to integers
hierarchy = hierarchy.fillna(0)

hierarchy = hierarchy.apply(lambda col: col.astype(int) if col.dtypes == 'float64' else col)
# Replace 0 values back to NaN after converting to integers
hierarchy = hierarchy.replace(0, np.nan)
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/677195587.py:12: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  hierarchy = hierarchy.applymap(lambda x: None if isinstance(x, (int, float)) and x < 0 else x)

Semantic Similarities in Relevant Texts under the Theoretical Construct "HP is an attachment figure" in both OA and SAA¶

In Hierarchy, search for theoretical constructs "HP is an attachment figure" in both programs, find the TC value and get all the RTs under the TCs

In [10]:
# Filter RT_data based on hierarchy and search terms

def filter_rt_by_category(hierarchy_df, rt_data_df, search_list, category):
    """
    Filters the RT_data DataFrame based on search terms found in the hierarchy DataFrame.

    Parameters:
    - hierarchy_df: DataFrame containing hierarchy information.
    - rt_data_df: DataFrame containing the RT (relevant texts) data.
    - search_list: List of strings to search for in the 'description' column of hierarchy_df.
    - category: A string indicating which category to use. Allowed values are "TC", "theme", or "RI".

    Returns:
    - A DataFrame containing the filtered RT_data rows.
    """
    filtered_hierarchy = hierarchy_df[hierarchy_df['description'].isin(search_list)]
    filtered_rt_data = pd.DataFrame()

    for _, row in filtered_hierarchy.iterrows():
        program = row['program']

        if category == "TC":
            # Get the TC value and construct the code text
            code_fragment = f"TC{int(row['TC'])}"
        elif category == "theme":
            # Assumes a column 'theme' exists with numeric values
            code_fragment = f"T{int(row['theme'])}"
        elif category == "RI":
            # Assumes a column 'RI' exists with numeric values
            code_fragment = f"RI{int(row['RI'])}"
        else:
            raise ValueError("Category must be one of: 'TC', 'theme', or 'RI'.")

        # Filter RT_data for matching code and program
        filtered_rt = rt_data_df[
            (rt_data_df['code'].str.contains(code_fragment)) &
            (rt_data_df['program'] == program)
        ]
        filtered_rt_data = pd.concat([filtered_rt_data, filtered_rt])
        # Add a new column 'label' combining 'description' and 'project' in the format of (<program>)
        filtered_rt_data['label'] = filtered_rt_data['RT'] + " (" + filtered_rt_data['program'] + ")"
        # print(filtered_rt_data[['RT', 'label']])

    return filtered_rt_data

# Example usage:
# search_terms = ["HP is an attachment figure"]
# rt_data_filtered = filter_rt_by_category(hierarchy, RT_data, search_terms, "TC")
# print(rt_data_filtered)

# Filter RT_data for TC values
search_terms = ["HP is an attachment figure"]
rt_data_filtered = filter_rt_by_category(hierarchy, RT_data, search_terms, "TC")
# Display the filtered RTs
print(rt_data_filtered)
                      code                                                 RT  \
321   OA > TC2 > T3 > RI26  Then without my control, without me doing anyt...   
322   OA > TC2 > T3 > RI26  This time was different. My mind had nothing t...   
323    OA > TC2 > T8 > RI8  My Higher Power sent me the message: I was as ...   
324   OA > TC2 > T3 > RI26  I’m very grateful that God has removed the obs...   
325   OA > TC2 > T3 > RI37  It took me so long to pick up the set of spiri...   
..                     ...                                                ...   
281  SAA > TC3 > T8 > RI38  or the first three steps over and over, like a...   
285  SAA > TC3 > T8 > RI38  It became more and more important to admit the...   
295  SAA > TC3 > T6 > RI35  Another gift of the program is the emphasis on...   
297  SAA > TC3 > T9 > RI33  I had a sense that the group was right for me....   
316  SAA > TC3 > T8 > RI29  Since the day I admitted my powerlessness and ...   

     characters  Beginning   End  narrative_num  RI program  coverage   TC  \
321       189.0       10.0  10.0              7  26      OA  0.037052  2.0   
322        70.0       10.0  10.0              7  26      OA  0.013723  2.0   
323        86.0       11.0  11.0             21   8      OA  0.015297  2.0   
324        53.0       20.0  20.0             21  26      OA  0.009427  2.0   
325       114.0       21.0  21.0             21  37      OA  0.020278  2.0   
..          ...        ...   ...            ...  ..     ...       ...  ...   
281        54.0        7.0   7.0             41  38     SAA  0.004676  3.0   
285       173.0       13.0  13.0             41  38     SAA  0.014980  3.0   
295       300.0       16.0  16.0              6  35     SAA  0.036670  3.0   
297       337.0        9.0   9.0              3  33     SAA  0.044671  3.0   
316       146.0        5.0   5.0             17  29     SAA  0.037932  3.0   

     theme   ID                                              label  
321    3.0  321  Then without my control, without me doing anyt...  
322    3.0  322  This time was different. My mind had nothing t...  
323    8.0  323  My Higher Power sent me the message: I was as ...  
324    3.0  324  I’m very grateful that God has removed the obs...  
325    3.0  325  It took me so long to pick up the set of spiri...  
..     ...  ...                                                ...  
281    8.0  281  or the first three steps over and over, like a...  
285    8.0  285  It became more and more important to admit the...  
295    6.0  295  Another gift of the program is the emphasis on...  
297    9.0  297  I had a sense that the group was right for me....  
316    8.0  316  Since the day I admitted my powerlessness and ...  

[154 rows x 13 columns]

Perform Semantic Similarity Analysis

In [11]:
def compute_similarities(sentences, model):
    """
    Compute embeddings for a series of sentences and return their similarity matrix.

    Parameters:
    - sentences: A list or pandas Series of text strings.
    - model: A pretrained SentenceTransformer model.

    Returns:
    - embeddings: A numpy array of sentence embeddings.
    - similarities: A torch Tensor containing the similarity scores.
    """
    embeddings = model.encode(sentences)
    # print("Embeddings shape:", embeddings.shape)  # For verification
    similarities = model.similarity(embeddings, embeddings)
    return embeddings, similarities

# Compute the similarities for the filtered RT data
embeddings, similarities = compute_similarities(rt_data_filtered['RT'], model)
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
In [12]:
def create_similarity_df(similarities, label_series):
    """
    Convert a similarity tensor/array to a DataFrame with given labels as index and columns.

    Parameters:
    - similarities: a torch.Tensor or a NumPy array of similarity values.
    - label_series: a list-like object (e.g., pandas Series) containing labels.

    Returns:
    - A pandas DataFrame with similarity values and labels as both index and columns.
    """
    # Convert to a numpy array if similarities is a torch.Tensor
    if hasattr(similarities, "detach"):
        sim_matrix = similarities.detach().cpu().numpy()
    else:
        sim_matrix = similarities
    similarity_temp = pd.DataFrame(sim_matrix, index=label_series, columns=label_series) # still in matrix like dataframe
    return matrix_to_df(similarity_temp)

def matrix_to_df(sim_df): # sub function of the function above
    # Create a mask for the upper triangle (excluding the diagonal)
    mask = np.triu_indices_from(sim_df, k=1)
    # Use the mask to get sentence pairs and their corresponding similarity values
    top_pairs = pd.DataFrame({
        'Sentence A': sim_df.index[mask[0]],
        'Sentence B': sim_df.columns[mask[1]],
        'Similarity Score': sim_df.values[mask]
    })
    return top_pairs

# Create the similarity DataFrame using the function
sim_df = create_similarity_df(similarities, rt_data_filtered['label'])

itables.show(sim_df, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
Sentence A Sentence B Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)
In [13]:
# Filter the DataFrame to keep only the OA-SAA pairs
# and remove duplicates
def process_oa_saa_pairs(df):
    """
    Process a DataFrame to filter, deduplicate, and standardize OA-SAA sentence pairs.
    
    Args:
    df (pd.DataFrame): Input DataFrame with 'Sentence A' and 'Sentence B' columns.
    
    Returns:
    pd.DataFrame: Processed DataFrame with 'OA' and 'SAA' columns.
    """
    # Create ordered pairs for deduplication
    df['Ordered Pair'] = df.apply(lambda row: tuple(sorted([row['Sentence A'], row['Sentence B']])), axis=1)
    
    # Filter for OA-SAA pairs and deduplicate
    mask = ((df['Sentence A'].str.contains('(OA)', regex=False) & df['Sentence B'].str.contains('(SAA)', regex=False)) |
            (df['Sentence A'].str.contains('(SAA)', regex=False) & df['Sentence B'].str.contains('(OA)', regex=False)))
    df = df[mask].drop_duplicates(subset='Ordered Pair').reset_index(drop=True)
    
    # Ensure OA sentences are in the first column
    swap_mask = ~df['Sentence A'].str.contains('(OA)', regex=False)
    df.loc[swap_mask, ['Sentence A', 'Sentence B']] = df.loc[swap_mask, ['Sentence B', 'Sentence A']].values
    
    # Rename columns and drop 'Ordered Pair'
    df = df.rename(columns={'Sentence A': 'OA', 'Sentence B': 'SAA'}).drop(columns='Ordered Pair')
    
    return df


def run_rt_similarity_analysis(hierarchy_df, rt_data_df, search_list, category, model): # Master function
    rt_data_filtered = filter_rt_by_category(hierarchy_df, rt_data_df, search_list, category)
    embeddings, similarities = compute_similarities(rt_data_filtered['RT'], model)
    sim_df = create_similarity_df(similarities, rt_data_filtered['label'])
    oa_saa_pairs = process_oa_saa_pairs(sim_df).sort_values('Similarity Score', ascending=False)
    return oa_saa_pairs


search_terms = ["HP is an attachment figure"]
# Call the function using the previously defined search_terms
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)
# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Semantic Similarities in Relevant Texts under the Theoretical Constructs about relational quality (OA) and social identity reconstruction (SAA)¶

  1. The quality of one’s HP relationship influences food addiction recovery (OA)
  2. Spirituality shapes social identity (SAA)
In [14]:
# Filter RT_data for TC values
search_terms = ["The quality of one’s HP relationship influences food addiction recovery",
                "Spirituality shapes social identity"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)

# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Semantic Similarities in Relevant Texts under the Theoretical Constructs about HP reliability (OA) and reshaping behavioral schema (SAA)¶

  1. Construction of a reliable HP is a pathway for food addiction recovery (OA)
  2. Spirituality reshapes behavioral schema (SAA)
In [15]:
# Filter RT_data for TC values
search_terms = ["Construction of a reliable HP is a pathway for food addiction recovery",
                "Spirituality reshapes behavioral schema"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)

# Display the processed DataFrame
itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Semantic Similarities/Differences in Relevant Texts under the Theoretical Constructs that are different¶

  1. HP originates in parental representations and can be modified (OA)
  2. Identity integrated with spirituality influences sex addiction (SAA)
In [16]:
# Filter RT_data for TC values
search_terms = ["HP originates in parental representations and can be modified",
                "Identity integrated with spirituality influences sex addiction"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "TC", model)

# Display the processed DataFrame
# oa_saa_pairs.head(20)

itables.show(oa_saa_pairs.head(20), buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Semantic Similarity Across Relevant Texts under Themes¶

Surrender as Entry Point¶

  • OA-TC2-T10: Surrender, acceptance, and trust in HP lead to recovery.
  • SAA-TC3-T9: Key steps toward admitting powerlessness and surrendering.
In [17]:
# Filter RT_data for TC values
search_terms = ["Surrender, acceptance, and trust in HP lead to recovery",
                "Key steps toward admitting powerlessness and surrendering"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Higher Power as Behavioral Regulator¶

OA SAA
TC2-T3 ("HP acts as an agent of support"):- RI16: Shifting dependence to HP cures compulsions- RI37: HP provides power to recover TC1-T2 ("Spirituality helps me abstain from sexual behaviors."):- RI18: HP removes triggers/harmful environments- RI17: Mental influences for abstinence
In [18]:
# Filter RT_data for TC values
search_terms = ["HP acts as an agent of support",
                "Spirituality helps me abstain from sexual behaviors."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Identity Reformation¶

OA-TC3-T11 ("Having a HP stabilizes one’s inner world"):

  • RI19: “Spiritual Awakening” changes behaviors and identity
  • RI27: Recovery starts with understanding self-worth

SAA-TC2-T4 ("Spirituality transformed parts of my identity to resolve my sex addiction."):

  • RI41: Spiritual and sexual identity integration
  • RI22: Guidance from God/HP
In [19]:
# Filter RT_data for TC values
search_terms = ["Having a HP stabilizes one’s inner world",
                "Spirituality transformed parts of my identity to resolve my sex addiction."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Spirituality can have negative effects if misunderstood or misapplied.¶

  • SAA: Spirituality is more harmful than good (5.2)
  • OA: An uninformed HP can be a negative influence on recovery (2)
In [20]:
# Filter RT_data for TC values
search_terms = ["Spirituality is more harmful than good in my life.",
                "An uninformed HP can be a negative influence on one’s recovery"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Spirituality as Social Support¶

  • HP is cocreated from meaningful relationships and experiences (OA)
  • Bidirectional influence of social support and spirituality (social support ↔ Spirituality) (SAA)
In [21]:
# Filter RT_data for TC values
search_terms = ["HP is cocreated from meaningful relationships and experiences",
                "Bidirectional influence of social support and spirituality (social support ↔ Spirituality)"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Comparison 2:

  • HP acts as an agent of support (OA)
  • Bidirectional influence of social support and spirituality (social support ↔ Spirituality) (SAA)
In [35]:
# Filter RT_data for TC values
search_terms = ["HP acts as an agent of support",
                "Bidirectional influence of social support and spirituality (social support ↔ Spirituality)"]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

The comfort and companionship provided by spirituality/HP¶

  • HP fulfills one’s need for comfort and acceptance (OA)
  • Thanks to spirituality, I do not have to fight sex addiction alone. (SAA)

The similarity score of sentences is lower than most other pairs.

In [34]:
# Filter RT_data for TC values
search_terms = ["HP fulfills one’s need for comfort and acceptance",
                "Thanks to spirituality, I do not have to fight sex addiction alone."]
oa_saa_pairs = run_rt_similarity_analysis(hierarchy, RT_data, search_terms, "theme", model)

# Display the processed DataFrame

itables.show(oa_saa_pairs, buttons=["copyHtml5", "csvHtml5", "excelHtml5"], classes=["display", "cell-border"]) # actively remove nowrap class
/opt/homebrew/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py:521: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
OA SAA Similarity Score
Loading ITables v2.2.5 from the init_notebook_mode cell... (need help?)

Hierarchical Clustering (Forced Grounded Theory Analysis)¶

Semi-Supervised process

45, 15, 5 for the number of RI, themes, and TCs

In [22]:
# Calculate the embeddings for the OA and SAA sentences
OA_embeddings = model.encode(RT_OA['RT'])
SAA_embeddings = model.encode(RT_SAA['RT'])

# Check the shape of the embeddings (for debugging)

SAA (doesn't make a lot of sense)¶

Level 1 (Repeating Ideas)¶

In [23]:
# Set a distance threshold to control cluster granularity.
# You may need to adjust the threshold based on your data characteristics.
# distance_threshold = None  # Set to None for automatic clustering

clustering_ri = AgglomerativeClustering(
    n_clusters=47,  # Set the number of clusters, 47 is currently the optimal number
    distance_threshold=None, # only used if n_clusters is None
    linkage='ward'
)
RI_labels = clustering_ri.fit_predict(SAA_embeddings)
RT_SAA['RI_cluster'] = RI_labels

# Validate clusters: remove clusters if the cluster has fewer than 2 texts
# or if texts come from the same narrative
valid_ri_clusters = []
for cluster in RT_SAA['RI_cluster'].unique():
    cluster_data = RT_SAA[RT_SAA['RI_cluster'] == cluster]
    if len(cluster_data) >= 2 and cluster_data['narrative_num'].nunique() >= 2:
        valid_ri_clusters.append(cluster)

RT_SAA_valid = RT_SAA[RT_SAA['RI_cluster'].isin(valid_ri_clusters)]
print("Valid RI clusters:", valid_ri_clusters)
print("Valid number of RI clusters:", len(valid_ri_clusters)) # 40
print("Valid number of RTs categorized under RIs", len(RT_SAA_valid)) 
Valid RI clusters: [0, 23, 5, 18, 35, 34, 16, 25, 9, 4, 2, 1, 6, 10, 11, 20, 14, 38, 41, 37, 3, 22, 13, 27, 24, 21, 40, 19, 17, 33, 36, 29, 8, 31, 44, 15, 28, 43, 7, 45, 12, 42]
Valid number of RI clusters: 42
Valid number of RTs categorized under RIs 308
In [24]:
# output to excel
output_path = os.path.expanduser("~/Documents/Coding/SAA_OA/data/RT_SAA_valid.xlsx")
RT_SAA_valid.to_excel(output_path, index=False)
print(f"Filtered RT_SAA_valid DataFrame saved to {output_path}")
Filtered RT_SAA_valid DataFrame saved to /Users/charlesli/Documents/Coding/SAA_OA/data/RT_SAA_valid.xlsx

Level 2 (Themes)¶

In [25]:
# For clustering themes, it's often useful to aggregate texts within each RI cluster.
# One simple method is to join RT texts per cluster and encode the aggregated text.
aggregated_texts = RT_SAA_valid.groupby('RI_cluster')['RT'].apply(lambda texts: " ".join(texts)).tolist()

# Get embeddings for aggregated texts (themes)
theme_embeddings = model.encode(aggregated_texts)

# Specify target number of themes (e.g., 15, but adjust if needed)
num_theme_clusters = None

clustering_theme = AgglomerativeClustering(
    n_clusters=num_theme_clusters,
    distance_threshold=1,
    linkage='ward'

)
theme_labels = clustering_theme.fit_predict(theme_embeddings)

# Map theme labels back to RI clusters
ri_to_theme = dict(zip(RT_SAA_valid['RI_cluster'].unique(), theme_labels))
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)


# --------------------------

# Step 2: Validate Themes (RIs to Themes)
# Only keep themes that have at least 2 unique RIs.
# --------------------------
# Count the unique RI clusters per theme in the valid RI clusters.
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(valid_theme_clusters)]





print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes", len(RT_SAA_valid_themes)) 
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique())) # 40
print("Number of valid_themes: ", len(valid_theme_clusters))
Step 2 - Valid themes (must have at least 2 unique RIs):
Valid number of RTs categorized under themes 256
Valid number of RI clusters categorized under themes: 34
Number of valid_themes:  9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/1094038717.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)

Fallback Technique (Compromising)¶

In [26]:
from numpy.linalg import norm

underpopulated_themes = {theme: count for theme, count in ri_counts_per_theme.items() if count < 2}

# For fallback, compute an average embedding for each theme.
# First create a mapping from each RI_cluster (key) to its embedding.
unique_ri_clusters = RT_SAA_valid['RI_cluster'].unique()
ri_embedding_dict = dict(zip(unique_ri_clusters, theme_embeddings))

# Group RI clusters by theme_cluster
theme_assignment = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].unique().to_dict()

# Calculate an aggregated embedding for each theme as the mean of its RI embeddings.
theme_embedding_dict = {}
for theme, ri_list in theme_assignment.items():
    emb_list = [ri_embedding_dict[ri] for ri in ri_list if ri in ri_embedding_dict]
    if emb_list:
        theme_embedding_dict[theme] = np.mean(emb_list, axis=0)

# Define a simple cosine distance function.
def cosine_distance(vec1, vec2):
    return 1 - np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-10)
In [27]:
# Set a merging threshold for fallback (adjust as appropriate)
MERGE_THRESHOLD = 0.6

# For each underpopulated (invalid) theme, check if a nearby valid theme exists.
for under_theme in list(underpopulated_themes.keys()):
    emb_under = theme_embedding_dict[under_theme]
    best_match = None
    best_distance = float("inf")
    # Compare against all valid themes.
    # all_themes = ri_counts_per_theme.index #(all theme clusters)
    for valid_theme in valid_theme_clusters: # 
        emb_valid = theme_embedding_dict[valid_theme]
        distance = cosine_distance(emb_under, emb_valid)
        if distance < best_distance:
            best_distance = distance
            best_match = valid_theme
    # If a sufficiently similar valid theme is found, reassign all RI clusters from the underpopulated theme.
    if best_distance < MERGE_THRESHOLD and best_match is not None:
        # Reassign RI clusters that were coded under the underpopulated theme to the best_match theme.
        for key, val in ri_to_theme.items():
            if val == under_theme:
                ri_to_theme[key] = best_match
        print(f"Merged underpopulated theme {under_theme} into theme {best_match} (distance={best_distance:.3f}).")
    else:
        print(f"No suitable merge found for underpopulated theme {under_theme}; it remains provisional.")

# Update the theme_cluster column using the updated mapping.
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)

# --------------------------
# (C) Final Validation: Only keep themes with at least 2 unique RI clusters.
# --------------------------
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
final_valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(final_valid_theme_clusters)]

# Print final validation results
print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes:", len(RT_SAA_valid_themes))
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique()))
print("Number of valid themes:", len(final_valid_theme_clusters))
Merged underpopulated theme 8 into theme 0 (distance=0.495).
No suitable merge found for underpopulated theme 9; it remains provisional.
Merged underpopulated theme 11 into theme 1 (distance=0.450).
Merged underpopulated theme 12 into theme 7 (distance=0.539).
No suitable merge found for underpopulated theme 13; it remains provisional.
Merged underpopulated theme 14 into theme 10 (distance=0.506).
Merged underpopulated theme 15 into theme 3 (distance=0.386).
Merged underpopulated theme 16 into theme 7 (distance=0.412).

Step 2 - Valid themes (must have at least 2 unique RIs):
Valid number of RTs categorized under themes: 297
Valid number of RI clusters categorized under themes: 40
Number of valid themes: 9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/47481451.py:28: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)

If we merge between the unvalid themes

In [28]:
# Set a merging threshold for fallback (adjust as appropriate)
MERGE_THRESHOLD = 0.6

# For each underpopulated (invalid) theme, check if a nearby valid theme exists.
for under_theme in list(underpopulated_themes.keys()):
    emb_under = theme_embedding_dict[under_theme]
    best_match = None
    best_distance = float("inf")
    # Compare against all valid themes.
    all_themes = ri_counts_per_theme.index #(all theme clusters)
    for theme in all_themes: # 
        if theme != under_theme:
            emb_all = theme_embedding_dict[theme]
            distance = cosine_distance(emb_under, emb_all)
            if distance < best_distance:
                best_distance = distance
                best_match = theme
    # If a sufficiently similar valid theme is found, reassign all RI clusters from the underpopulated theme.
    if best_distance < MERGE_THRESHOLD and best_match is not None:
        # Reassign RI clusters that were coded under the underpopulated theme to the best_match theme.
        for key, val in ri_to_theme.items():
            if val == under_theme:
                ri_to_theme[key] = best_match
        print(f"Merged underpopulated theme {under_theme} into theme {best_match} (distance={best_distance:.3f}).")
    else:
        print(f"No suitable merge found for underpopulated theme {under_theme}; it remains provisional.")

# Update the theme_cluster column using the updated mapping.
RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)

# --------------------------
# (C) Final Validation: Only keep themes with at least 2 unique RI clusters.
# --------------------------
ri_counts_per_theme = RT_SAA_valid.groupby('theme_cluster')['RI_cluster'].nunique()
final_valid_theme_clusters = ri_counts_per_theme[ri_counts_per_theme >= 2].index
RT_SAA_valid_themes = RT_SAA_valid[RT_SAA_valid['theme_cluster'].isin(final_valid_theme_clusters)]

# Print final validation results
print("\nStep 2 - Valid themes (must have at least 2 unique RIs):")
print("Valid number of RTs categorized under themes:", len(RT_SAA_valid_themes))
print("Valid number of RI clusters categorized under themes:", len(RT_SAA_valid_themes['RI_cluster'].unique()))
print("Number of valid themes:", len(final_valid_theme_clusters))
Merged underpopulated theme 8 into theme 0 (distance=0.495).
No suitable merge found for underpopulated theme 9; it remains provisional.
Merged underpopulated theme 11 into theme 1 (distance=0.450).
Merged underpopulated theme 12 into theme 7 (distance=0.539).
No suitable merge found for underpopulated theme 13; it remains provisional.
Merged underpopulated theme 14 into theme 10 (distance=0.506).
Merged underpopulated theme 15 into theme 3 (distance=0.386).
Merged underpopulated theme 16 into theme 7 (distance=0.412).

Step 2 - Valid themes (must have at least 2 unique RIs):
Valid number of RTs categorized under themes: 297
Valid number of RI clusters categorized under themes: 40
Number of valid themes: 9
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/4223298657.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RT_SAA_valid['theme_cluster'] = RT_SAA_valid['RI_cluster'].map(ri_to_theme)

Level 3 (Theoretical Constructs) (problematic)¶

In [29]:
# Aggregate RT texts within each valid theme cluster by joining texts from associated RIs.
aggregated_theme_texts = RT_SAA_valid_themes.groupby('theme_cluster')['RT'].apply(lambda texts: " ".join(texts)).tolist()

# Get embeddings for aggregated themes using your preloaded sentence transformer model.
tc_embeddings = model.encode(aggregated_theme_texts)

# Specify the target number of theoretical constructs, e.g., 5.
num_tc_clusters = 6

# Perform clustering on the aggregated theme embeddings.
clustering_tc = AgglomerativeClustering(
    n_clusters=num_tc_clusters,
    distance_threshold=None,
    linkage='ward'
)
tc_labels = clustering_tc.fit_predict(tc_embeddings)

# Map the generated TC labels back to the associated theme clusters.
# First, get a sorted list of unique theme clusters from RT_SAA_valid_themes.
unique_themes = sorted(RT_SAA_valid_themes['theme_cluster'].unique())
# Build a mapping: each unique theme gets a corresponding TC label.
theme_to_tc = dict(zip(unique_themes, tc_labels))

# Create a new column for theoretical construct labels.
RT_SAA_valid_themes['TC_cluster'] = RT_SAA_valid_themes['theme_cluster'].map(theme_to_tc)

# Validate these TC clusters by counting unique themes within each TC.
tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()
print("\nStep 3 - Preliminary TC clusters and their unique theme counts:")
print(tc_counts)

# Force each theoretical construct to have at least 2 unique themes.
# For any TC with less than 2 themes, apply a fallback strategy (here, we duplicate its theme list).
tc_groups = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].unique().to_dict()
final_tc_assignment = {}
for tc, themes in tc_groups.items():
    if len(themes) < 2:
        print(f"TC {tc} has only {len(themes)} unique theme(s). Forcing assignment.")
        # Force the TC to meet the minimum requirement – here we duplicate the theme.
        final_tc_assignment[tc] = list(themes) * 2
    else:
        final_tc_assignment[tc] = list(themes)

print("\nFinal TC assignments (each TC now forced to have at least 2 themes):")
print(final_tc_assignment)
Step 3 - Preliminary TC clusters and their unique theme counts:
TC_cluster
0    2
1    1
2    3
3    1
4    1
5    1
Name: theme_cluster, dtype: int64
TC 1 has only 1 unique theme(s). Forcing assignment.
TC 3 has only 1 unique theme(s). Forcing assignment.
TC 4 has only 1 unique theme(s). Forcing assignment.
TC 5 has only 1 unique theme(s). Forcing assignment.

Final TC assignments (each TC now forced to have at least 2 themes):
{0: [10, 0], 1: [7, 7], 2: [1, 3, 6], 3: [4, 4], 4: [2, 2], 5: [5, 5]}
/var/folders/pt/qhvg39g17wv22v0b1s3b9bv00000gn/T/ipykernel_96849/2795416904.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RT_SAA_valid_themes['TC_cluster'] = RT_SAA_valid_themes['theme_cluster'].map(theme_to_tc)

Fallback Technique (Compromising)¶

In [30]:
# Define cosine distance function
def cosine_distance(vec1, vec2):
    return 1 - np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-10)

# ---------------------------
# Step 1: Precompute Pairwise Theme Distances
# ---------------------------
def compute_pairwise_distances(theme_ids, theme_embedding_dict):
    pairwise_distances = {}
    for i, th1 in enumerate(theme_ids):
        for j in range(i + 1, len(theme_ids)):
            th2 = theme_ids[j]
            dist = cosine_distance(theme_embedding_dict[th1], theme_embedding_dict[th2])
            pairwise_distances[(th1, th2)] = dist
            pairwise_distances[(th2, th1)] = dist
    return pairwise_distances
In [ ]:
# Create a DataFrame from the tc_embeddings numpy array using the ri_counts_per_theme.index as the index
# tc_embeddings_df = pd.DataFrame(tc_embeddings, index=ri_counts_per_theme.index)
# tc_embeddings_df
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[31], line 2
      1 # Create a DataFrame from the tc_embeddings numpy array using the ri_counts_per_theme.index as the index
----> 2 tc_embeddings_df = pd.DataFrame(tc_embeddings, index=ri_counts_per_theme.index)
      3 tc_embeddings_df

File /opt/homebrew/lib/python3.10/site-packages/pandas/core/frame.py:827, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    816         mgr = dict_to_mgr(
    817             # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
    818             # attribute "name"
   (...)
    824             copy=_copy,
    825         )
    826     else:
--> 827         mgr = ndarray_to_mgr(
    828             data,
    829             index,
    830             columns,
    831             dtype=dtype,
    832             copy=copy,
    833             typ=manager,
    834         )
    836 # For data is list-like, or Iterable (will consume into list)
    837 elif is_list_like(data):

File /opt/homebrew/lib/python3.10/site-packages/pandas/core/internals/construction.py:336, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
    331 # _prep_ndarraylike ensures that values.ndim == 2 at this point
    332 index, columns = _get_axes(
    333     values.shape[0], values.shape[1], index=index, columns=columns
    334 )
--> 336 _check_values_indices_shape_match(values, index, columns)
    338 if typ == "array":
    339     if issubclass(values.dtype.type, str):

File /opt/homebrew/lib/python3.10/site-packages/pandas/core/internals/construction.py:420, in _check_values_indices_shape_match(values, index, columns)
    418 passed = values.shape
    419 implied = (len(index), len(columns))
--> 420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")

ValueError: Shape of passed values is (9, 768), indices imply (11, 768)

Fallback Technique¶

In [ ]:
pairwise_distances = compute_pairwise_distances(tc_embeddings_df.index, theme_embedding_dict)
In [ ]:
# Count unique themes per TC
initial_tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()

# Identify underpopulated TCs (less than 3 themes)
underpopulated_tcs = initial_tc_counts[initial_tc_counts < 2].index.tolist()
populated_tcs = initial_tc_counts[initial_tc_counts >= 2].index.tolist()

redistribution_mapping = {}

# Iterate through underpopulated TCs
for under_tc in underpopulated_tcs:
    # Get themes in the underpopulated TC
    under_themes = RT_SAA_valid_themes[RT_SAA_valid_themes['TC_cluster'] == under_tc]['theme_cluster'].unique()
    
    # Find the closest populated TC for each theme in the underpopulated TC
    for theme in under_themes:
        best_target_tc = None
        best_distance = float('inf')
        
        for populated_tc in populated_tcs:
            target_themes = RT_SAA_valid_themes[RT_SAA_valid_themes['TC_cluster'] == populated_tc]['theme_cluster'].unique()
            
            # Calculate average distance between the theme and all themes in the populated TC
            distances = [pairwise_distances.get((theme, t), pairwise_distances.get((t, theme), float('inf'))) for t in target_themes]
            avg_distance = np.mean(distances) if distances else float('inf')
            
            if avg_distance < best_distance:
                best_distance = avg_distance
                best_target_tc = populated_tc
        
        # Assign the theme to the best target TC
        if best_target_tc is not None and best_distance <= MERGE_THRESHOLD:
            redistribution_mapping[theme] = best_target_tc

# Apply redistribution mapping to update TC assignments
for theme, new_tc in redistribution_mapping.items():
    RT_SAA_valid_themes.loc[RT_SAA_valid_themes['theme_cluster'] == theme, 'TC_cluster'] = new_tc

# ---------------------------
# Step 3: Verify Updated TC Distribution
# ---------------------------
updated_tc_counts = RT_SAA_valid_themes.groupby('TC_cluster')['theme_cluster'].nunique()
print("\nUpdated TC distribution:")
print(updated_tc_counts)

# Finalize the updated DataFrame
RT_SAA_valid_TC = RT_SAA_valid_themes.copy()
print("\nFinal RT_SAA_valid_TC DataFrame:")
print(RT_SAA_valid_TC)
Updated TC distribution:
TC_cluster
0    4
1    5
2    3
Name: theme_cluster, dtype: int64

Final RT_SAA_valid_TC DataFrame:
                        code  \
0      SAA > TC2 > T15 > RI6   
1    SAA > Orphan RIs > RI26   
2    SAA > Orphan RIs > RI26   
3     SAA > TC4 > T10 > RI34   
4    SAA > Orphan RIs > RI26   
..                       ...   
313   SAA > TC4 > T12 > RI21   
314   SAA > TC2 > T14 > RI23   
315    SAA > TC3 > T8 > RI29   
316    SAA > TC2 > T4 > RI49   
317   SAA > TC4 > T11 > RI39   

                                                    RT  characters  Beginning  \
0    My religion told me homosexuality was wrong, s...       351.0        1.0   
1    It was not “the next one” that saved me—it was...       247.0       13.0   
2    After all, my partner was at church and would ...       368.0        7.0   
3    A man entered the room and related that having...       353.0       29.0   
4    The lightning jolts of terror I registered the...       697.0        3.0   
..                                                 ...         ...        ...   
313  As I have maintained my commitment to recovery...       325.0        6.0   
314  But my acting out was getting worse rather tha...       286.0        3.0   
315  Since the day I admitted my powerlessness and ...       146.0        5.0   
316  I hoped that by praying and working my program...       172.0        3.0   
317  Eventually I met a woman and we began a commit...       204.0        7.0   

      End  narrative_num  RI program  coverage   TC  theme   ID  RI_cluster  \
0     1.0              4   6     SAA  0.039688  2.0   15.0    1          44   
1    13.0              4  26     SAA  0.027929  NaN    NaN    2          14   
2     7.0              4  26     SAA  0.041610  NaN    NaN    3           9   
3    29.0              2  34     SAA  0.022808  4.0   10.0    4           2   
4     4.0              2  26     SAA  0.045035  NaN    NaN    5          25   
..    ...            ...  ..     ...       ...  ...    ...  ...         ...   
313   6.0              5  21     SAA  0.048843  4.0   12.0  314           3   
314   3.0             17  23     SAA  0.074305  2.0   14.0  315          12   
315   5.0             17  29     SAA  0.037932  3.0    8.0  316          19   
316   3.0             17  49     SAA  0.044687  2.0    4.0  317          26   
317   7.0             17  39     SAA  0.053001  4.0   11.0  318           3   

     theme_cluster  TC_cluster  
0                1           2  
1               10           1  
2                0           0  
3                9           1  
4                3           0  
..             ...         ...  
313             19           1  
314              3           0  
315              0           0  
316             19           1  
317             19           1  

[308 rows x 15 columns]
In [ ]:
print("Final Cluster Overview")
print(len(RT_SAA)) # total Relevant texts
print(len(RT_SAA_valid)) # valid RI clusters
print(len(RT_SAA_valid_themes)) # valid themes
print(len(RT_SAA_valid_TC)) # valid TCs
Final Cluster Overview
318
308
308
308